Random walk on the usage of Dataframes[Meta].jl

From Tidier.jl to DataFrames[Meta].jl After a rough view of Tidier.jl, I was impressed by its simplicity and high-level packing features. Tidier.jl would definitely make it easier for R users to adapt into Julia.

But after played with it for a while, I realized that it generally make some key features in DataFrames.jl (for example, Symbol, broadcast) implicit, which is not good for me when trying to think as a Julian.

So I decide to go back to DataFrames.jl and DataFramesMeta.jl for datascience-related-work. Below are some draft scripts that familiarize myself with the operations, just for records.

julia

using DataFrames
using DataFramesMeta
using RDatasets

# Learn DataFrames + DataFramesMeta workflow
# In compare with Tidier.jl
movies = dataset("ggplot2", "movies")

julia

Select

julia

# Select the first 5 columns by name:
# in tidier:
# @chain movies begin
#     @select(Title, Year, Length, Budget, Rating)
#     @slice(1:5)
# end

movies[1:5, names(movies)[1:5]]
movies[1:5, Between(:Title,:Rating)]

@chain movies begin
    # @select(:Title, :Year, :Length, :Budget, :Rating)
    @select(Between(:Title,:Rating))
    first(5)
end

# Select all but the first 5 columns [by name]

# @chain movies begin
#     @select(-(Title:Rating)) # or !()
#     @select(1:5)
#     @slice(1:5)
# end

movies[1:5, Not(Between(:Title, :Rating))]

julia

rename

julia

# in tidier:
# @chain movies begin
#     @rename(title = Title, Minutes = Length)
#     @select(1:5)
#     @slice(1:5)
# end

@chain movies begin
    @rename(:title = :Title, :Minutes = :Length)
    @select(Between(1,5))
    first(5)
end

@chain movies begin
    @transform(:title = :Title, :Minutes = :Length)
    @select(Between(1,5))
    first(5)
end

julia

mutate (transform)

julia

# DataFramesMeta do not auto broadcast functions
@chain movies begin
    @subset( .!(ismissing.(:Budget)))
    @transform(:Budget = :Budget ./ 1_000_000)
    @select(:Title, :Budget)
    first(5)
end

@chain movies begin
    @subset(.!(ismissing.(:Budget)))
    # wrapp the range in vector so that it won't be broadcasted
    @transform(:Nineties = :Year .∈ [1990:1999])
    @select(:Title, :Year, :Nineties)
    first(5)
end

# using @r... version macros to do per-row manipulate to avoid broadcast
@chain movies begin
    @rsubset(!ismissing(:Budget))
    @rtransform(:Nineties = :Year ∈ 1990:1999)
    @select(:Title, :Year, :Nineties)
    first(5)
end

julia

summarize

julia

# summarize df with combine
using StatsBase
@chain movies begin
    @combine($AsTable = (median_budget = median(skipmissing(:Budget)),
                    mean_budget = mean(skipmissing(:Budget)))
            )
end

@chain movies begin
    @groupby(:Year)
    @combine(:sum = sum(skipmissing(:Budget)))
    @orderby(-:sum)
end

# ungroup gdf use combine 
gm = @groupby(movies, :Year)
combine(gm, All())
@combine(gm, $(All()))

julia

filter (subset)

julia

# filter with @subset or @rsubset
@chain movies begin
    @transform(:Budget = :Budget / 1000000)
    @subset(:Budget .>= mean(skipmissing(:Budget))) 
    # since we need the whole vector of :Budget to calc mean, 
    # we may not use byrow version macros
    @select(:Title, :Budget)
    first(5)
end

# use byrow, use multi-filter
@chain movies begin
    # the following four are the same:
    @rsubset(:Votes >= 200 && :Rating >= 8) # use @rsubset to auto-broadcast
    @rsubset((:Votes >= 200) & (:Rating >= 8)) # bitwise & has a higher precedence than `>=`, so the filter should be wrapped
    @rsubset(:Votes >= 200, :Rating >= 8) # multi-filter are treated as AND
    @subset(:Votes .>= 200 .&& :Rating .>= 8)

    @select(:Title, :Votes, :Rating)
    first(5)
end

# filter with rownumber
# do we really need to wrap this into @chain workflow?
# the index version generally works fine!
movies[1:2:10,:]

# remove duplicates
df = DataFrame(a = 1:10, b = repeat('a':'e', inner = 2))
unique(df, :b)

# apply multiple variables and multiple functions
# use broadcast to mimic across() in Tidier.jl
# use $ to escape in macros
@chain movies begin
    # combine([:Rating, :Budget] .=> [mean∘skipmissing median∘skipmissing])
    @combine($([:Rating, :Budget] .=> [mean∘skipmissing median∘skipmissing]))
end

julia

Control flow

julia

# if_else() and case_when() in Tidier.jl:
# note: @passmissing should only work witin row-wise macros
df = DataFrame(a = [1, 2, missing, 4, 5])

# if_else
using Missings
@chain df begin
    @rtransform @passmissing :b = :a >= 3 ? "yes" : "no"
    @transform( :b = Missings.replace(:b, "unknown") |> collect ) 
end

# case_when
switch(f, x...) = f(x...)
@chain df begin
    @rtransform :b = switch(:a) do x
        ismissing(x) && return "unknown"
        x > 4 && return "hi"
        x > 2 && return "medium"
        x > 0 && return "low"
        return "unknown"
    end
end

@chain df begin
    @rtransform :b = ismissing(:a) ? "unknown" :
                     :a > 4 ? "hi" :
                     :a > 2 ? "medium" :
                     :a > 0 ? "low" :
                     "unknown"
    end
end

julia

combine dataframes: vcat and hcat

julia

df1 = DataFrame(A=1:3, B=1:3)
df2 = DataFrame(A=4:6, B=4:6)
df3 = DataFrame(A=7:9, C=7:9)
df4 = DataFrame()

# vcat
vcat(df1, df2)
vcat(df1, df3) # error if use default cols=:setequal
vcat(df1, df3, cols = :union)
vcat(df1, df3, cols = :intersect)
# use source to indicate the origins:
vcat(df1, df2, df3, df4, cols=:union, source="source")
vcat(df1, df2, source="source")

# hcat
hcat(df1, df2) # duplicate colnames cause error, use makeunique=true to make unique colnames
hcat(df1, df2, makeunique = true)

julia

seperate and combine column strings

Warn That is someting I thing TidierData.jl makes it easier. TidierData.jl: @separate, @unite, @seperate_rows macros works good, no need to build my own. but these macros use bare word as colnames.
julia

df = DataFrame(a = ["1-1", "2-2", "3-3-3"]);
using TidierData: @separate, @separate_rows, @unite
@chain df begin
    @separate(a, (b,c, d), "-")
end

julia

Deal with multi cols using @astable and AsTable

julia

df = DataFrame(a = [1, 2, 3], b = [400, 500, 600]);
@chain df begin
    @transform @astable begin
        ex = extrema(:b)
        :b_first = :b .- first(ex)
        :b_last = :b .- last(ex)
    end
end

# same as:
@chain df begin
    @transform(:bfirst = :b .- first(extrema(:b)), :blast = :b .- last(extrema(:b)))
end

# two pros for @astable:
# 1. create multiple cols from a single transformation
# 2. perform intermediate calculations and store results in reusable vars
#    (guarantee sequenctial transformation that DataFrames.jl do not)

# Manipulate muli-columns with AsTable()
df = DataFrame(a = [11, 14], b = [17, 10], c = [12, 5]);
vars = ["a", "b"];

@rtransform df :y = sum(AsTable(vars))
@rtransform df :y = sum(AsTable([:a, :b]))

function fun_with_new_name(x::NamedTuple)
    nms = string.(propertynames(x))
    new_name = Symbol(join(nms, "_"), "_sum")
    s = sum(x)
    (; new_name => s)
end

# function that return a NamedTuple will convert to table with AsTable
# AsTable can be used as both source and destination in "source => transformation => destination" operation,
# in DataFramesMeta macros, that means "destination = transformation(source)"
# when use AsTable as source, it signals that the columns selected by the wrapped selector cols
#      should be passed as a NamedTuple to the function.
# when use AsTable as destination, it means the result of the transformation operation
#      is a vector of containers (or a single container if ByRow(transformation) is used)
#      that should be expanded into multiple columns using keys to get column names.
@rtransform df $AsTable = fun_with_new_name(AsTable([:a, :b]))
@rsubset df sum(AsTable(vars)) > 500

julia